package com.example.demo.service;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * 选择器提取内容网页爬虫
 */
/**
 * Selector-based page crawler: extracts article links, titles, publication
 * dates and abstracts from a news listing page using CSS selectors.
 */
public class SelectSpiderService implements PageProcessor {

    // Part 1: site-wide crawl configuration (encoding, delay, retries, ...).
    // Immutable after construction, so declared final.
    private final Site site = Site.me()
            // Retry a failed request up to 3 times before giving up.
            .setCycleRetryTimes(3)
            // Wait 1000 ms between requests to be polite to the server.
            .setSleepTime(1000);

    /**
     * Core extraction logic, invoked once for every downloaded page.
     *
     * @param page page returned by the crawler; carries the raw HTML and
     *             acts as the sink for extracted result fields, which are
     *             later handed to the configured pipelines
     */
    @Override
    public void process(Page page) {
        // Static HTML of the downloaded page.
        Html html = page.getHtml();
        // 1) select the <div class="box-content"> container
        // 2) select every <li> element inside it
        List<Selectable> items = html.$("div[class=box-content]").$("li").nodes();
        // Pre-size the result list: one entry per <li>.
        List<Map<String, String>> articles = new ArrayList<>(items.size());
        for (Selectable item : items) {
            Map<String, String> article = new HashMap<>();
            // Article link (href attribute of the anchor).
            String href = item.$("a", "href").get();
            article.put("href", href);
            // Article title ("text" extracts the tag's inner text).
            String title = item.$("span", "text").get();
            article.put("title", title);
            // Publication date.
            String date = item.$("em", "text").get();
            article.put("date", date);
            // Article abstract / summary.
            String abstracts = item.$("p", "text").get();
            article.put("abstracts", abstracts);
            System.out.println(date + " " + title + " " + href + " " + abstracts);
            articles.add(article);
        }
        page.putField("author", "wbx");
        page.putField("articles", articles);
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // Proxy-enabled downloader; currently unused — see the commented-out
        // setDownloader(...) call below. Kept for easy re-enabling.
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
        httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy("221.5.80.66", 3128)));
        Spider.create(new SelectSpiderService())
                .addUrl("http://www.ccpit-henan.org/mcxw/index.jhtml")
                // Write extracted fields out as JSON files.
                .addPipeline(new JsonFilePipeline("D:\\webmagic\\"))
                .thread(5)
//                .setDownloader(httpClientDownloader)
                .run();
    }
}
